import pandas as pd
import csv
import string
import re
import nltk
nltk.download('stopwords')
nltk.download('names')
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
# --- Load the raw data and take a first look -------------------------------
# Show up to 150 characters per cell so descriptions are readable.
pd.set_option('display.max_colwidth', 150)
df = pd.read_csv("C:\\Users\\Aruna\\Documents\\input\\Amazon EC2 Full.csv")

# Convert every description to a string and collapse all whitespace runs
# (including newlines and tabs) to single spaces.
# Missing values are replaced with '' first: str(NaN) would otherwise inject
# the literal word "nan" into the text, the word counts and the output file.
df['description'] = df['description'].fillna('').apply(lambda text: " ".join(str(text).split()))
df.head(10)
df.info()

# Row label used as a spot-check sample throughout the notebook.
# Raises KeyError if the data has no row labelled 5100.
p = 5100
df['description'][p]

# 30 most frequent whitespace-separated tokens before any cleaning.
pd.Series(' '.join(df['description']).split()).value_counts()[:30]

# split() (rather than split(' ')) so that an empty description counts as
# zero words instead of one.
print("There are totally", df['description'].apply(lambda text: len(text.split())).sum(), "words before cleaning.")
# --- Stop words and cleaning regexes ---------------------------------------
# Start from the NLTK English stop-word list.
# NOTE: this rebinds the STOPWORDS name imported from wordcloud at the top of
# the file; from here on STOPWORDS is a plain list of strings.
STOPWORDS = stopwords.words('english')

# Domain-specific noise: greetings/sign-offs, AWS product names, month and
# weekday names, contractions, and a few common auxiliaries.
my_stop_words = ["hi", "hello", "regards", "thank", "thanks", "regard", "best", "wishes", "hey", "amazon", "aws", "s3",
                 "elastic", "beanstalk", "rds", "ec2", "lambda", "cloudfront", "cloud", "front", "vpc", "sns", "me",
                 "january", "february", "march", "april", "may", "june", "july", "august", "september", "october",
                 "november", "december", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov",
                 "dec", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "mon", "tue",
                 "wed", "thu", "fri", "sat", "sun", "ain't", "aren't", "can't", "can't've", "'cause", "could've", "couldn't",
                 "couldn't've", "didn't", "doesn't", "don't", "hadn't", "hadn't've", "hasn't", "haven't", "he'd", "he'd've",
                 "he'll", "he'll've", "he's", "how'd", "how'd'y", "how'll", "how's", "i'd", "i'd've", "i'll", "i'll've", "i'm",
                 "i've", "isn't", "it'd", "it'd've", "it'll", "it'll've", "it's", "let's", "mayn't", "might've", "mightn't",
                 "mightn't've", "must've", "mustn't", "mustn't've", "needn't", "needn't've", "oughtn't", "oughtn't've", "shan't",
                 "sha'n't", "shan't've", "she'd", "she'd've", "she'll", "she'll've", "she's", "should've", "shouldn't", "shouldn't've",
                 "so've", "so's", "that'd", "that'd've", "that's", "there'd", "there'd've", "there's", "they'd", "they'd've", "they'll",
                 "they'll've", "they're", "they've", "to've", "wasn't", "we'd", "we'd've", "we'll", "we'll've", "we're", "we've",
                 "weren't", "what'll", "what'll've", "what're", "what's", "what've", "when's", "when've", "where'd", "where's",
                 "where've", "who'll", "who'll've", "who's", "who've", "why's", "why've", "will've", "won't", "won't've", "would've",
                 "wouldn't", "wouldn't've", "yall", "yalld", "yalldve", "yallre", "yallve", "youd", "youdve", "youll",
                 "youllve", "youre", "youve", "do", "did", "does", "had", "have", "has", "could", "can", "as", "is",
                 "shall", "should", "would", "will", "you", "me", "please", "know", "who", "we", "was", "were", "edited", "by", "pm"]

# First names from the NLTK names corpus (entries are capitalised, e.g. "Aaron").
name = names.words()
STOPWORDS.extend(my_stop_words)
# Lower-case the names before adding them. The descriptions are lower-cased
# before stop-word removal, and BAD_SYMBOLS_RE below deletes every upper-case
# letter, so a capitalised "Bill" would otherwise end up as the stop word
# "ill" instead of "bill".
STOPWORDS.extend(n.lower() for n in name)

# Punctuation that separates words: replaced by a space.
# Raw string so that \[ \] \| are regex escapes, not (invalid) string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,:;#+?]')
# Everything that is not a digit, lower-case letter, space, underscore or dot
# is deleted.
# NOTE(review): inside the class, ' - ' is parsed as a range from space to
# space, so '-' is NOT among the kept characters and hyphens are deleted
# ("well-known" -> "wellknown"). Confirm whether hyphens were meant to be kept.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z - _.]+')
# Non-greedy match of anything between '<' and '>'.
REMOVE_HTML_RE = re.compile(r'<.*?>')
# 'http' followed by any run of non-whitespace characters.
REMOVE_HTTP_RE = re.compile(r'http\S+')

# Pass the stop words through the same symbol filter as the text, so that
# e.g. "can't" becomes "cant" and still matches the cleaned tokens.
STOPWORDS = [BAD_SYMBOLS_RE.sub('', x) for x in STOPWORDS]
# --- Text cleaning pipeline -------------------------------------------------
# Each step rewrites df['description'] in place; the sample row p is shown
# after every step so the effect can be inspected.

# 1. Lower-case.
df['description'] = df['description'].apply(lambda text: str(text).lower())
df['description'][p]

# 2. Remove HTML tags. The regex is applied to the whole string rather than
#    to each whitespace-separated token: a tag with attributes such as
#    <a href="..."> contains a space and would never match token by token.
#    Trade-off: free text like "a < b and c > d" loses the part between the
#    angle brackets.
df['description'] = df['description'].apply(lambda text: " ".join(REMOVE_HTML_RE.sub(' ', str(text)).split()))
df['description'][p]

# 3. Remove URLs.
df['description'] = df['description'].apply(lambda text: " ".join(REMOVE_HTTP_RE.sub(' ', str(text)).split()))
df['description'][p]

# 4. Turn separator punctuation into spaces.
df['description'] = df['description'].apply(lambda text: " ".join(REPLACE_BY_SPACE_RE.sub(' ', str(text)).split()))
df['description'][p]

# 5. Delete every remaining character that BAD_SYMBOLS_RE does not allow.
df['description'] = df['description'].apply(lambda text: " ".join(BAD_SYMBOLS_RE.sub('', str(text)).split()))
df['description'][p]

# 6. Trim leading/trailing '.', '-' and '_' from every token in one pass, so
#    mixed runs such as "_.foo" are fully stripped. Tokens that become empty
#    disappear at the next split().
df['description'] = df['description'].apply(lambda text: " ".join(tok.strip('.-_') for tok in text.split()))
df['description'][p]

# 7. Drop tokens made up only of digits.
df['description'] = df['description'].apply(lambda text: " ".join(tok for tok in text.split() if not tok.isdigit()))
df['description'][p]

# 8. Drop stop words and one-character tokens. The stop-word list is turned
#    into a set once so each membership test is O(1) instead of a scan over
#    the whole list.
stopword_set = set(STOPWORDS)
df['description'] = df['description'].apply(
    lambda text: " ".join(tok for tok in text.split() if len(tok) > 1 and tok not in stopword_set))
df['description'][p]
df.head()

# 30 most frequent tokens after cleaning.
pd.Series(' '.join(df['description']).split()).value_counts()[:30]

# split() (rather than split(' ')) so that descriptions that ended up empty
# count as zero words instead of one.
print("There are totally", df['description'].apply(lambda text: len(text.split())).sum(), "words after cleaning.")
# --- Export the cleaned text ------------------------------------------------
# Rows are APPENDED ('a' mode), so re-running this adds the same rows again.
# newline='' lets the csv module control line endings.
with open('C:\\Users\\Aruna\\Documents\\ACMS-IID\\input\\CleanText.csv', 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # No header row is written; columns are id, label, description.
    # Iterate the three columns in lockstep instead of indexing by position
    # label, so this also works when the DataFrame index is not 0..n-1.
    # Descriptions of length 0 or 1 are skipped.
    writer.writerows(
        [row_id, label, text]
        for row_id, label, text in zip(df['id'], df['label'], df['description'])
        if len(text) > 1
    )
# --- Word clouds --------------------------------------------------------------
# Concatenate all cleaned descriptions once; the text is the same for every
# cloud, only the number of words shown changes.
msgs = " ".join(str(msg) for msg in df['description'])

# One figure per vocabulary size, from the 20 to the 1000 most frequent words.
for max_words in (20, 50, 100, 500, 1000):
    fig, ax = plt.subplots(1, 1, figsize = (100,100))
    wordcloud = WordCloud(max_font_size = 20, max_words = max_words, background_color = "white").generate(msgs)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')  # hide axes and ticks; only the image is of interest